# -*- coding: utf-8 -*-

'''
This code file pulls the 421a benefit types from assessment records.
'''


# Import libraries
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator, TextConverter

import os
import io
import pandas as pd
import numpy as np
import urllib.request 
import time
import multiprocessing
import re



# Work out of the folder that holds the BBL list and receives the temporary PDFs
path = "/Users/esoltas/Dropbox (MIT)/Research/NYC421a/data/raw/pdfs/"
os.chdir(path)

# Load the 'bbl' column of the input CSV: these are the BBLs to scrape
bbls = pd.read_csv('bbls_to_scrape.csv').bbl

# Statement dates to check for each BBL (redundant ones: 20140525,20150525)
dates = [20190115]

# Module-level accumulator for the scraped rows
results = []

# Download PDF of assessment for a specific BBL and date
def download_file(bbl,date):
	"""Download the assessment statement PDF for one BBL and statement date.

	Sleeps 15 seconds, requests the statement (stmtType=ASR) from the
	nyc.gov StatementSearch endpoint, and writes the response body to
	"<bbl>_<date>.pdf" in the current working directory.

	bbl  -- value placed in the ``bbl`` query parameter
	date -- value placed in the ``stmtDate`` query parameter

	Errors raised by urllib (network/HTTP failures) or by the file write
	propagate to the caller.
	"""

	# Pause before every request (presumably to avoid overloading the server)
	time.sleep(15)

	url = ('https://a836-mspuvw-dofptsz.nyc.gov/PTSCM/StatementSearch'
		'?bbl={}&stmtDate={}&stmtType=ASR').format(bbl, date)
	filename = "{}_{}.pdf".format(bbl, date)

	# Context managers close the HTTP response and the output file even when
	# the read or the write fails part-way through
	with urllib.request.urlopen(url) as response:
		content = response.read()

	with open(filename, 'wb') as pdf_file:
		pdf_file.write(content)

# Delete the PDF
def delete_file(bbl,date):
	"""Remove the statement PDF "<bbl>_<date>.pdf" from the working directory.

	Raises whatever os.remove raises (e.g. if the file does not exist).
	"""
	pdf_name = "{}_{}.pdf".format(bbl, date)
	os.remove(pdf_name)

# Parse a PDF
def parsePDF(bbl,date):
	"""Extract the text of the downloaded statement PDF "<bbl>_<date>.pdf".

	Every page of the PDF is run through a pdfminer TextConverter that
	writes into an in-memory buffer; the buffer's full contents are returned
	as one string.

	Errors from opening the file or from pdfminer propagate to the caller.
	"""

	filename = "{}_{}.pdf".format(bbl, date)

	# Setup for PDF Miner
	resource_manager = PDFResourceManager()
	fake_file_handle = io.StringIO()
	converter = TextConverter(resource_manager, fake_file_handle)

	try:
		page_interpreter = PDFPageInterpreter(resource_manager, converter)

		with open(filename, 'rb') as fh:
			for page in PDFPage.get_pages(fh,
										  caching=True,
										  check_extractable=True):
				page_interpreter.process_page(page)

		return fake_file_handle.getvalue()

	finally:
		# Release the converter and buffer even when a page fails to parse
		converter.close()
		fake_file_handle.close()

# Define 421a flags
# Markers searched for in the statement text, in the order of the flag
# columns: 10uncap, 15uncap, 25uncap, 20uncap, 10cap, 15cap
_FLAG_MARKERS = ("•5110", "•5113", "•5114", "•5116", "•5117", "•5118")

# Amounts are the run of digits/commas that directly follows each label
_MARKETVAL_REGEX = re.compile(r"(?<=ESTIMATEDMARKETVALUE)[\d,]+")
_BILLABLEAV_REGEX = re.compile(r"(?<=WILL BE BASED ON)[\d,]+")


def _extract_amount(regex, text):
	"""Return the first amount `regex` finds in `text` as an int, or 0 if none."""
	match = regex.search(text)
	if match is None:
		return 0
	digits = match.group(0).replace(',', '')
	# A match made only of commas leaves nothing to convert
	return int(digits) if digits else 0


def detect421a(text):
	"""Detect the 421a markers and the two amounts in a statement's text.

	text -- the extracted statement text (a str)

	Returns a list of eight values:
	[flag10uncap, flag15uncap, flag25uncap, flag20uncap, flag10cap,
	flag15cap, marketval, billableav]. Each flag is True when its marker
	occurs anywhere in the text, including at the very start. marketval and
	billableav are the integers following "ESTIMATEDMARKETVALUE" and
	"WILL BE BASED ON" with commas removed, or 0 when the label is not
	found, so a missing amount does not discard the flags that were found.
	"""

	flags = [marker in text for marker in _FLAG_MARKERS]

	marketval = _extract_amount(_MARKETVAL_REGEX, text)
	billableav = _extract_amount(_BILLABLEAV_REGEX, text)

	return flags + [marketval, billableav]

# Check all dates 
def checkdates(bbl):
	"""Scrape one BBL across every date in `dates` and return its result row.

	For each date the statement PDF is downloaded, parsed, checked with
	detect421a, and deleted again. The per-date rows are combined with an
	element-wise maximum, so a flag is set if it was detected on any date.

	Returns [bbl, flag10uncap, flag15uncap, flag25uncap, flag20uncap,
	flag10cap, flag15cap, marketval, billableav]. Because the rows pass
	through np.maximum, the values after `bbl` are NumPy scalars rather
	than Python bools/ints. The row is also printed.

	A failure while parsing or detecting counts as "nothing found" for that
	date. A failure while downloading propagates to the caller.
	"""

	empty_row = [False,False,False,False,False,False,0,0]

	# Initialize row
	row = list(empty_row)

	for date in dates:

		download_file(bbl,date)

		try:
			text = parsePDF(bbl,date)
			row_tmp = detect421a(text)

		except Exception:
			# Best effort: an unreadable statement contributes an empty row.
			# KeyboardInterrupt/SystemExit are deliberately not caught here.
			row_tmp = list(empty_row)

		finally:
			# Always clean up the temporary PDF, even when interrupted
			delete_file(bbl,date)

		# Take maximum (looking to detect 421a across assessment dates)
		row = list(np.maximum(row,row_tmp))

	row = [bbl] + row
	print(row)

	return row


def scrape_sequential():
	"""Scrape every BBL one at a time and save the rows to scrape.csv.

	Serial counterpart of main(). It is not called at import time: main()
	(run from the __main__ guard) already performs the scrape, and
	module-level work would be repeated by any multiprocessing worker that
	re-imports this file.

	Returns the DataFrame that was written.
	"""

	# One list per BBL; keeping each row intact (rather than flattening it
	# into the accumulator) is what gives the frame its nine columns
	rows = [checkdates(b) for b in bbls]

	df = pd.DataFrame(rows,columns=['bbl','flag10uncap','flag15uncap',
		'flag25uncap','flag20uncap','flag10cap','flag15cap','marketval','billableav'])

	# Save to CSV
	df.to_csv("scrape.csv",index=False)

	return df




def collect_results(row):
	"""Add the rows produced by the worker pool to the module-level `results`.

	Used as the map_async callback in main(), so despite its name `row` is
	the whole list of per-BBL rows; each one is added to `results` in order.
	"""
	for scraped_row in row:
		results.append(scraped_row)

def main():
	"""Scrape all BBLs with a pool of worker processes and write scrape.csv.

	Runs checkdates over `bbls` using one worker per CPU, gathers the rows
	into the module-level `results` through collect_results, and saves them
	as a CSV with one row per BBL.

	Raises whatever exception a worker raised, before any CSV is written.
	"""

	pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
	async_result = pool.map_async(checkdates, bbls, callback=collect_results)
	pool.close()
	pool.join()

	# The callback only fires when every task succeeded. get() re-raises a
	# worker's exception so a failed run cannot silently write an empty CSV.
	async_result.get()

	# Save as pandas df
	df = pd.DataFrame(results,columns=['bbl','flag10uncap','flag15uncap',
		'flag25uncap','flag20uncap','flag10cap','flag15cap','marketval','billableav'])

	# Save to CSV
	df.to_csv("scrape.csv",index=False)

		
# Entry point: run the parallel scrape only when this file is executed as a
# script, not when it is imported
if __name__ == "__main__":
	main()
